{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# From loops and conditionals, to working with data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 4 9 16 25 "
     ]
    }
   ],
   "source": [
    "# reviewing looping through a list\n",
    "my_list = [1, 2, 3, 4, 5]\n",
    "for value in my_list:\n",
    "    print(value ** 2, end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 4 9 16 25 "
     ]
    }
   ],
   "source": [
    "# reviewing range() to loop a set number of times\n",
    "for value in range(5):\n",
    "    print((value + 1) ** 2, end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found it\n"
     ]
    }
   ],
   "source": [
    "# testing values as we loop through a list\n",
    "my_value = 2\n",
    "for value in range(5):\n",
    "    if value == my_value:\n",
    "        print('found it')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found it\n"
     ]
    }
   ],
   "source": [
    "# encapsulate with a function and find if some value appears in some list\n",
    "def find_value(my_value, my_list):\n",
    "    is_found = False\n",
    "    for value in my_list:\n",
    "        if value == my_value:\n",
    "            is_found = True\n",
    "    return is_found\n",
    "\n",
    "# now use our function\n",
    "user_id = 3\n",
    "user_ids = [1, 2, 3, 4, 5, 6]\n",
    "if find_value(user_id, user_ids):\n",
    "    print('found it')\n",
    "else:\n",
    "    print('did not find it')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "unsupported operand type(s) for ** or pow(): 'range' and 'int'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-5-29289dd35655>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# what if we want to raise all the numbers in a list to the 2nd power?\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m**\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m: unsupported operand type(s) for ** or pow(): 'range' and 'int'"
     ]
    }
   ],
   "source": [
    "# what if we want to raise all the numbers in a list to the 2nd power?\n",
    "range(10) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# oops, error! instead, use list comprehension: do the operation element-wise\n",
    "[ x ** 2 for x in range(10) ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now we will use these same loops and conditionals to work with a data file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import python's csv module to open a csv file\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['neighborhood', 'price', 'bedrooms', 'date', 'sqft', 'longitude', 'latitude']\n",
      "['foster city', '2495', '1', '11/14/2014 12:26', '755', '-122.27', '37.5538']\n",
      "['palo alto', '2695', '', '11/14/2014 12:25', '443', '-122.161524', '37.450289']\n",
      "['brisbane', '3150', '2', '11/14/2014 12:24', '1242', '-122.417912', '37.692415']\n",
      "['palo alto', '2800', '2', '11/14/2014 12:24', '', '', '']\n",
      "['san mateo', '2196', '1', '11/14/2014 12:24', '676', '-122.2998', '37.5395']\n",
      "['santa clara', '3264', '3', '11/14/2014 12:28', '1138', '', '']\n",
      "['san jose south', '2000', '2', '11/14/2014 12:28', '822', '-121.902268', '37.253503']\n",
      "['sunnyvale', '4740', '3', '11/14/2014 12:28', '1406', '-122.034683', '37.368445']\n",
      "['inner sunset / UCSF', '3395', '2', '11/14/2014 12:32', '', '-122.479345', '37.764582']\n",
      "['richmond / seacliff', '2699', '1', '11/14/2014 12:32', '', '-122.503781', '37.7718']\n",
      "['SOMA / south beach', '3620', '1', '11/14/2014 12:30', '860', '-122.395195', '37.775133']\n",
      "['dublin / pleasanton / livermore', '2025', '1', '11/14/2014 12:18', '636', '-121.787665', '37.67963']\n",
      "['concord / pleasant hill / martinez', '', '2', '11/14/2014 12:18', '1019', '-122.035275', '37.975259']\n",
      "['hercules, pinole, san pablo, el sob', '1795', '1', '11/14/2014 12:17', '715', '-122.321672', '37.978086']\n",
      "['corte madera', '4299', '3', '11/14/2014 12:33', '1533', '-122.488541', '37.919834']\n"
     ]
    }
   ],
   "source": [
    "# with guarantees the file will be closed even if we hit an exception\n",
    "# rt means 'read' and 'text mode'\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    for row in my_csv:\n",
    "        print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['neighborhood', 'price', 'bedrooms', 'date', 'sqft', 'longitude', 'latitude']\n"
     ]
    }
   ],
   "source": [
    "# the column headers are the first row in the data file\n",
    "# use next to iterate our csv reader to the first row to grab the headers\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    headers = next(my_csv)\n",
    "    print(headers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'price'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# what is the 1st column (zero-indexed) in our data set?\n",
    "headers[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "price\n",
      "2495\n",
      "2695\n",
      "3150\n",
      "2800\n",
      "2196\n",
      "3264\n",
      "2000\n",
      "4740\n",
      "3395\n",
      "2699\n",
      "3620\n",
      "2025\n",
      "\n",
      "1795\n",
      "4299\n"
     ]
    }
   ],
   "source": [
    "# for each row in the data set, print the price column's value\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    for row in my_csv:\n",
    "        print(row[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['price',\n",
       " '2495',\n",
       " '2695',\n",
       " '3150',\n",
       " '2800',\n",
       " '2196',\n",
       " '3264',\n",
       " '2000',\n",
       " '4740',\n",
       " '3395',\n",
       " '2699',\n",
       " '3620',\n",
       " '2025',\n",
       " '',\n",
       " '1795',\n",
       " '4299']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new list to contain the column of prices in the data set\n",
    "prices = []\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    for row in my_csv:\n",
    "        prices.append(row[1])  \n",
    "prices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This list has a couple of problems. First, it includes the header. Second, it's all strings even though prices are numeric data. Third, it contains some empty strings. We'll have to clean it up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2495',\n",
       " '2695',\n",
       " '3150',\n",
       " '2800',\n",
       " '2196',\n",
       " '3264',\n",
       " '2000',\n",
       " '4740',\n",
       " '3395',\n",
       " '2699',\n",
       " '3620',\n",
       " '2025',\n",
       " '',\n",
       " '1795',\n",
       " '4299']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# to remove the first element of the list, we can just capture position 1 through the end of the list\n",
    "prices = prices[1:]\n",
    "prices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2495\n",
      "2695\n",
      "3150\n",
      "2800\n",
      "2196\n",
      "3264\n",
      "2000\n",
      "4740\n",
      "3395\n",
      "2699\n",
      "3620\n",
      "2025\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "could not convert string to float: ",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-14-e7def979dcd8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# now let's convert the price strings to integers\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mprice\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mprices\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprice\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m: could not convert string to float: "
     ]
    }
   ],
   "source": [
    "# now let's convert the price strings to integers\n",
    "for price in prices:\n",
    "    print(int(float(price)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2495\n",
      "2695\n",
      "3150\n",
      "2800\n",
      "2196\n",
      "3264\n",
      "2000\n",
      "4740\n",
      "3395\n",
      "2699\n",
      "3620\n",
      "2025\n",
      "None\n",
      "1795\n",
      "4299\n"
     ]
    }
   ],
   "source": [
    "# you can't convert an empty string to a numeric type\n",
    "for price in prices:\n",
    "    if not price == '':\n",
    "        print(int(float(price)))\n",
    "    else:\n",
    "        print('None')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# encapsulate this functionality inside a new function\n",
    "def extract_int_price(price):\n",
    "    if not price == '':\n",
    "        return int(float(price))\n",
    "    else:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2495\n",
      "2695\n",
      "3150\n",
      "2800\n",
      "2196\n",
      "3264\n",
      "2000\n",
      "4740\n",
      "3395\n",
      "2699\n",
      "3620\n",
      "2025\n",
      "None\n",
      "1795\n",
      "4299\n"
     ]
    }
   ],
   "source": [
    "# use our function to convert each element in the list of prices to an integer\n",
    "for price in prices:\n",
    "    print(extract_int_price(price))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2495, 2695, 3150, 2800, 2196, 3264, 2000, 4740, 3395, 2699, 3620, 2025, None, 1795, 4299]\n"
     ]
    }
   ],
   "source": [
    "# rather than just printing each converted value, turn it into a new list called int_prices\n",
    "int_prices = []\n",
    "for price in prices:\n",
    "    int_prices.append(extract_int_price(price))\n",
    "print(int_prices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2495, 2695, 3150, 2800, 2196, 3264, 2000, 4740, 3395, 2699, 3620, 2025, None, 1795, 4299]\n"
     ]
    }
   ],
   "source": [
    "# do the same thing, using list comprehension\n",
    "int_prices = [ extract_int_price(price) for price in prices ]\n",
    "print(int_prices)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now let's clean up our neighborhood names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "foster city\n",
      "palo alto\n",
      "brisbane\n",
      "palo alto\n",
      "san mateo\n",
      "santa clara\n",
      "san jose south\n",
      "sunnyvale\n",
      "inner sunset - UCSF\n",
      "richmond - seacliff\n",
      "SOMA - south beach\n",
      "dublin - pleasanton - livermore\n",
      "concord - pleasant hill - martinez\n",
      "hercules, pinole, san pablo, el sob\n",
      "corte madera\n"
     ]
    }
   ],
   "source": [
    "# replace any forward slashes in neighborhood name with a hyphen\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    next(my_csv) #skip the header row\n",
    "    for row in my_csv:\n",
    "        print(row[0].replace('/', '-')) #use string.replace() method"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a new data set with cleaned up variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "first create a couple of functions to do the cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a new function to convert bedrooms from a string to an int\n",
    "def extract_int_bedrooms(bedrooms):\n",
    "    if not bedrooms == '':\n",
    "        return int(float(bedrooms))\n",
    "    else:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a new function to replace forward slashes and commas with hyphens\n",
    "def clean_neighborhood(neighborhood_name):\n",
    "    # you can daisy chain multiple string.replace() methods\n",
    "    return neighborhood_name.replace('/', '-').replace(',', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['foster city', 2495, 1],\n",
       " ['palo alto', 2695, None],\n",
       " ['brisbane', 3150, 2],\n",
       " ['palo alto', 2800, 2],\n",
       " ['san mateo', 2196, 1],\n",
       " ['santa clara', 3264, 3],\n",
       " ['san jose south', 2000, 2],\n",
       " ['sunnyvale', 4740, 3],\n",
       " ['inner sunset - UCSF', 3395, 2],\n",
       " ['richmond - seacliff', 2699, 1],\n",
       " ['SOMA - south beach', 3620, 1],\n",
       " ['dublin - pleasanton - livermore', 2025, 1],\n",
       " ['concord - pleasant hill - martinez', None, 2],\n",
       " ['hercules pinole san pablo el sob', 1795, 1],\n",
       " ['corte madera', 4299, 3]]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# clean the data set by calling the cleaning functions and save the results to variables\n",
    "rentals_cleaned = []\n",
    "with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    next(my_csv)\n",
    "    for row in my_csv:\n",
    "        neighborhood_cleaned = clean_neighborhood(row[0])\n",
    "        price_cleaned = extract_int_price(row[1])\n",
    "        bedrooms_cleaned = extract_int_bedrooms(row[2])\n",
    "        rentals_cleaned.append([neighborhood_cleaned, price_cleaned, bedrooms_cleaned])      \n",
    "\n",
    "# display our nested lists of data        \n",
    "rentals_cleaned"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## What are the mean and max rainfall amounts?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use rain.csv to calculate mean and maximum values in a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.14159"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use round to round a number to a specified precision\n",
    "round(3.14159265359, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean: 3.7 inches\n",
      "max: 5.9 inches\n"
     ]
    }
   ],
   "source": [
    "with open('rain.csv', 'r', encoding='utf-8', newline=None) as csvfile:\n",
    "    \n",
    "    # initialize a counter and variables to contain our descriptive stats\n",
    "    count = 0 #at the end, divide cumulative_sum by this to get the mean\n",
    "    cumulative_sum = 0 #our rolling sum\n",
    "    max_value = -1 #pick a really small number that's guaranteed to be less than the max\n",
    "    \n",
    "    # open the file and skip the header row\n",
    "    my_csv = csv.reader(csvfile)\n",
    "    next(my_csv)\n",
    "    \n",
    "    # loop through each data row\n",
    "    for row in my_csv:\n",
    "        \n",
    "        # rainfall amount is in column 1, only process this row's value if not an empty string\n",
    "        if not row[1] == '':\n",
    "            \n",
    "            # increment the counter and extract this row's rainfall as a float\n",
    "            count = count + 1\n",
    "            rainfall = float(row[1])\n",
    "            \n",
    "            # add this row's rainfall to the cumulative sum\n",
    "            cumulative_sum = cumulative_sum + rainfall\n",
    "            \n",
    "            # if this row's rainfall is greater than the current max value, update with the new max\n",
    "            if rainfall > max_value:\n",
    "                max_value = rainfall\n",
    "\n",
    "    # after looping through all the rows, divide the cumulative sum by the count and round to get the mean\n",
    "    mean_value = round(cumulative_sum / count, 1)\n",
    "    \n",
    "    # print out the mean and max values\n",
    "    print('mean:', mean_value, 'inches')\n",
    "    print('max:', max_value, 'inches')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Now, how would you calculate the minimum?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
